package com.icbc.processor;

import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

import java.util.Random;

/**
 * WebMagic page processor that crawls blog articles.
 *
 * <p>For every downloaded page it queues all article links found in the page
 * source for further crawling, then extracts the article title and content.
 * Pages on which either value is missing or blank are marked as skipped.
 *
 * @author cfei
 * @author kaki
 */
@Component
public class ArticlePageProcessor implements PageProcessor {

    /**
     * Pattern of article URLs to follow. The dots in the host name are escaped
     * so that they match a literal '.' only, not an arbitrary character.
     */
    private static final String ARTICLE_URL_REGEX =
            "https://my\\.oschina\\.net/u/[0-9]+/blog/[0-9]+";

    /** XPath selecting the text of the article title heading. */
    private static final String TITLE_XPATH =
            "//*[@id=\"mainScreen\"]/div/div[1]/div/div[2]/div[1]/div[2]/h2/text()";

    /** XPath selecting the element that holds the article body. */
    private static final String CONTENT_XPATH = "//*[@id=\"articleContent\"]";

    /**
     * Site settings shared by every request issued for this processor.
     *
     * <p>The sleep time is drawn once, when this object is created, as a random
     * multiple of 100 in the range [0, 119900]; it is not re-randomized per request.
     */
    private final Site site = Site.me()
            .setRetryTimes(1) // number of retries
            .setSleepTime(new Random().nextInt(1200) * 100) // sleep time between requests
            .setTimeOut(10000); // request timeout

    /**
     * Queues further article links and extracts the title and content of the page.
     *
     * <p>When both title and content are non-blank they are stored in the page's
     * result fields under the keys {@code "title"} and {@code "content"};
     * otherwise the page is marked as skipped.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        // Crawl recursively: every article URL found in this page becomes a new target.
        page.addTargetRequests(page.getHtml().regex(ARTICLE_URL_REGEX).all());

        // Extract the article data.
        String title = page.getHtml().xpath(TITLE_XPATH).toString();
        String content = page.getHtml().xpath(CONTENT_XPATH).toString();

        // isNoneBlank is varargs: true only if neither value is null, empty or whitespace.
        if (StringUtils.isNoneBlank(title, content)) {
            page.putField("title", title);
            page.putField("content", content);
        } else {
            // Not an article page (or the layout did not match): skip it.
            page.setSkip(true);
        }
    }

    /**
     * Returns the site settings: sleep time, timeout and retry count.
     *
     * @return the site configuration used by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }
}
